Consensi check¶

In [1]:
# library import
from utils import *

RepeatModeler2¶

In [2]:
# RepeatModeler2 consensus library.
# cons_parser, length_hist and Stats are provided by the `utils` star import;
# presumably they parse the FASTA headers/sequences and plot the length
# distribution -- confirm against utils.
rm2_fasta_path = "data/consensi/RM2_consensi.fa.classified"
seq_dict = cons_parser(rm2_fasta_path)
length_hist(seq_dict)
# Keep Stats as the final expression: its return value is what the cell
# renders (summary plus per-class counts when classification=True).
Stats(seq_dict, classification=True)
Out[2]:

Summary

Number of consensi:15384
Longest sequence:14783
Shortest sequence:29
Average length:2262.0
LTR12565
tRNA14
LINE513
DNA147
RC34
SINE22
rRNA8
Satellite5
Unknown2069
Simple_repeat7
In [3]:
# Alignment file (SAM) paired with the consensus FASTA it was produced for.
# NOTE(review): reads_hist comes from utils; presumably it plots reads per
# consensus -- confirm.
reads_hist(
    "data/bam/RM2_consensi_check.sam",
    "data/consensi/RM2_consensi.fa.classified",
)

EDTA¶

In [4]:
# EDTA consensus library, processed with the same three helper calls as the
# RepeatModeler2 section.
edta_fasta_path = "data/consensi/EDTA_consensi.fa"
seq_dict = cons_parser(edta_fasta_path)
length_hist(seq_dict)
# Spell the flag out by keyword, matching the RepeatModeler2 cell; Stats must
# stay the last expression so its summary is rendered as the cell output.
Stats(seq_dict, classification=True)
Out[4]:

Summary

Number of consensi:16191
Longest sequence:16685
Shortest sequence:80
Average length:1972.0
DNA4725
LTR10166
MITE1300
In [5]:
# Alignment file (SAM) paired with the EDTA consensus FASTA.
# NOTE(review): reads_hist comes from utils; presumably it plots reads per
# consensus -- confirm.
reads_hist(
    "data/bam/EDTA_consensi_check.sam",
    "data/consensi/EDTA_consensi.fa",
)

MITE-Tracker¶

In [6]:
# Length of every record in the MITE-Tracker consensus FASTA.
# The record id is not needed here, so only the sequence length is collected.
seq_lens = [
    len(seq_record.seq)
    for seq_record in SeqIO.parse("data/consensi/MITE_consensi.fa", "fasta")
]

# Histogram of consensus lengths.
trace = go.Histogram(x=seq_lens)
fig = go.Figure(
    data=trace,
    layout=go.Layout(
        title="MITE-Tracker",
        xaxis=dict(title="sequence length"),
        yaxis=dict(title="count"),
    ),
)
fig.show()

# Stats receives the plain list of lengths here; no per-class breakdown is
# requested. It stays the last expression so the summary is the cell output.
Stats(seq_lens, classification=False)
Out[6]:

Summary

Number of consensi:10863
Longest sequence:800
Shortest sequence:49
Average length:289.0
In [7]:
# Alignment file (SAM) paired with the MITE-Tracker consensus FASTA.
# NOTE(review): reads_hist comes from utils; presumably it plots reads per
# consensus -- confirm.
reads_hist(
    "data/bam/MITE_consensi_check.sam",
    "data/consensi/MITE_consensi.fa",
)

RepeatMasker¶

Before trimming¶

In [8]:
# Length of every record in the untrimmed RepeatMasker consensus FASTA.
# The record id is not needed here, so only the sequence length is collected.
seq_lens = [
    len(seq_record.seq)
    for seq_record in SeqIO.parse("data/consensi/RM_consensi.fa", "fasta")
]

# Histogram of consensus lengths.
trace = go.Histogram(x=seq_lens)
fig = go.Figure(
    data=trace,
    layout=go.Layout(
        title="RepeatMasker",
        xaxis=dict(title="sequence length"),
        yaxis=dict(title="count"),
    ),
)
fig.show()

# Stats receives the plain list of lengths here; no per-class breakdown is
# requested. It stays the last expression so the summary is the cell output.
Stats(seq_lens, classification=False)
Out[8]:

Summary

Number of consensi:12214
Longest sequence:43821
Shortest sequence:30
Average length:2197.0

After trimming¶

sed -e '/@/,+1d;/SAT/,+1d;/^\s*$/d' RM_consensi.fa > RM_consensi.fa.trimmed
  • remove sequences that have "@" in their headers (simple repeats);
  • remove satellites and microsatellites ("SAT" pattern).
In [9]:
# Lengths of the records in the trimmed RepeatMasker FASTA. Any record whose
# sequence is empty has its id printed, so a header left without a sequence
# after trimming shows up in the cell output.
seq_lens = []
for seq_record in SeqIO.parse("data/consensi/RM_consensi.fa.trimmed", "fasta"):
    seq_id = seq_record.id
    seq_len = len(seq_record.seq)
    seq_lens.append(seq_len)
    if seq_len == 0:
        print(seq_id)

# Histogram of consensus lengths.
trace = go.Histogram(x=seq_lens)
fig = go.Figure(
    data=trace,
    layout=go.Layout(
        title="RepeatMasker",
        xaxis=dict(title="sequence length"),
        yaxis=dict(title="count"),
    ),
)
fig.show()

# Last expression of the cell: the summary is rendered as the cell output.
Stats(seq_lens, classification=False)
Out[9]:

Summary

Number of consensi:12120
Longest sequence:43821
Shortest sequence:30
Average length:2213.0